Concrete Dataset Analysis¶
In [1]:
#%pip install pandas numpy matplotlib seaborn missingno autogluon shapley
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
%matplotlib inline
EDA¶
In [3]:
df = pd.read_csv("dataset.csv", sep=";").reset_index(drop=True)
In [4]:
df.drop(columns=["Unnamed: 0", "id"], inplace=True)
In [5]:
df.head()
Out[5]:
| CementComponent | BlastFurnaceSlag | FlyAshComponent | WaterComponent | SuperplasticizerComponent | CoarseAggregateComponent | FineAggregateComponent | AgeInDays | Strength | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 525.0 | 0.0 | 0.0 | 186.0 | 0.0 | 1125.0 | 613.0 | 3.0 | 10.38 |
| 1 | 143.0 | 169.0 | 143.0 | 191.0 | 8.0 | 967.0 | 643.0 | 28.0 | 23.52 |
| 2 | 289.0 | 134.7 | 0.0 | 185.7 | 0.0 | 1075.0 | 795.3 | 28.0 | 36.96 |
| 3 | 304.0 | 76.0 | 0.0 | 228.0 | 0.0 | 932.0 | 670.0 | 365.0 | 39.05 |
| 4 | 157.0 | 236.0 | 0.0 | 192.0 | 0.0 | 935.4 | 781.2 | NaN | 74.19 |
In [6]:
df.columns
Out[6]:
Index(['CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent',
'WaterComponent', 'SuperplasticizerComponent',
'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays',
'Strength'],
dtype='object')
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5407 entries, 0 to 5406 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CementComponent 5194 non-null float64 1 BlastFurnaceSlag 5079 non-null float64 2 FlyAshComponent 5258 non-null float64 3 WaterComponent 5044 non-null float64 4 SuperplasticizerComponent 5178 non-null float64 5 CoarseAggregateComponent 5095 non-null float64 6 FineAggregateComponent 5133 non-null float64 7 AgeInDays 5062 non-null float64 8 Strength 5407 non-null float64 dtypes: float64(9) memory usage: 380.3 KB
In [8]:
df.describe()
Out[8]:
| CementComponent | BlastFurnaceSlag | FlyAshComponent | WaterComponent | SuperplasticizerComponent | CoarseAggregateComponent | FineAggregateComponent | AgeInDays | Strength | |
|---|---|---|---|---|---|---|---|---|---|
| count | 5194.000000 | 5079.000000 | 5258.000000 | 5044.000000 | 5178.000000 | 5095.000000 | 5133.000000 | 5062.000000 | 5407.000000 |
| mean | 299.073585 | 58.557531 | 31.976949 | 185.067407 | 4.120614 | 991.653185 | 771.449834 | 51.344528 | 35.452071 |
| std | 105.800354 | 83.286366 | 54.667373 | 18.510484 | 5.686415 | 77.165369 | 78.510465 | 69.752855 | 16.401896 |
| min | 102.000000 | 0.000000 | 0.000000 | 121.800000 | 0.000000 | 801.000000 | 594.000000 | 1.000000 | 2.330000 |
| 25% | 213.500000 | 0.000000 | 0.000000 | 175.100000 | 0.000000 | 938.200000 | 734.300000 | 7.000000 | 23.640000 |
| 50% | 297.200000 | 0.000000 | 0.000000 | 187.400000 | 0.000000 | 978.000000 | 781.200000 | 28.000000 | 33.950000 |
| 75% | 375.000000 | 122.600000 | 79.000000 | 192.000000 | 8.100000 | 1047.000000 | 821.000000 | 56.000000 | 45.850000 |
| max | 540.000000 | 359.400000 | 200.100000 | 247.000000 | 32.200000 | 1145.000000 | 992.600000 | 365.000000 | 82.600000 |
In [9]:
# missing values
df.isnull().sum()
Out[9]:
CementComponent 213 BlastFurnaceSlag 328 FlyAshComponent 149 WaterComponent 363 SuperplasticizerComponent 229 CoarseAggregateComponent 312 FineAggregateComponent 274 AgeInDays 345 Strength 0 dtype: int64
In [10]:
# percentuale di missing values in una notazione percentuale
df.isnull().sum() / len(df) *100
Out[10]:
CementComponent 3.939338 BlastFurnaceSlag 6.066210 FlyAshComponent 2.755687 WaterComponent 6.713520 SuperplasticizerComponent 4.235251 CoarseAggregateComponent 5.770298 FineAggregateComponent 5.067505 AgeInDays 6.380618 Strength 0.000000 dtype: float64
todo trovare cosa fare con i missing values¶
In [11]:
msno.matrix(df)
msno.heatmap(df)
Out[11]:
<Axes: >
proviamo con o un simple imputer usando la mediana dato che gli histogram sono skewed o con un knn che dovrebbe essere robusto
In [12]:
import missingno as msno
msno.bar(df)
msno.matrix(df)
msno.heatmap(df)
msno.dendrogram(df)
msno.dendrogram(df, figsize=(10, 10))
msno.dendrogram(df, figsize=(10, 10))
msno.dendrogram(df, figsize=(10, 10))
Out[12]:
<Axes: >
In [13]:
# histogram for each feature
for feature in df.columns:
sns.histplot(df[feature])
plt.show()
try to impute with knn
In [ ]:
from sklearn.impute import KNNImputer
# copy dataframe and run KNN imputation
df_imputed_knn = df.copy()
imputer = KNNImputer(n_neighbors=5, weights='distance')
df_imputed_knn = pd.DataFrame(imputer.fit_transform(df_imputed_knn),
columns=df.columns, index=df.index)
# save imputed copy
df_imputed_knn.to_csv("dataset_imputed_knn.csv", index=False)
# quick check
print("Missing before:\n", df.isnull().sum())
print("\nMissing after (KNN imputed):\n", df_imputed_knn.isnull().sum())
In [23]:
# median before and after
print("Median before:\n", df.median())
print("\nMedian after (KNN imputed):\n", df_imputed_knn.median())
Median before: CementComponent 297.20 BlastFurnaceSlag 0.00 FlyAshComponent 0.00 WaterComponent 187.40 SuperplasticizerComponent 0.00 CoarseAggregateComponent 978.00 FineAggregateComponent 781.20 AgeInDays 28.00 Strength 33.95 dtype: float64 Median after (KNN imputed): CementComponent 297.200000 BlastFurnaceSlag 0.000000 FlyAshComponent 0.000000 WaterComponent 188.448466 SuperplasticizerComponent 0.000000 CoarseAggregateComponent 978.000000 FineAggregateComponent 781.200000 AgeInDays 28.000000 Strength 33.950000 dtype: float64
In [37]:
# ANALISI OUTLIERS - Comprehensive Outlier Detection
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import IsolationForest
print("="*70)
print("OUTLIER ANALYSIS - Concrete Dataset")
print("="*70)
# Use the imputed dataset
df_analysis = df_imputed_knn.copy()
# 1. STATISTICAL SUMMARY
print("\n" + "="*70)
print("1. STATISTICAL SUMMARY")
print("="*70)
print(df_analysis.describe())
# 2. Z-SCORE METHOD (outliers > 3 standard deviations)
print("\n" + "="*70)
print("2. Z-SCORE METHOD (|z| > 3)")
print("="*70)
z_scores = pd.DataFrame(np.abs(stats.zscore(df_analysis)), columns=df_analysis.columns, index=df_analysis.index)
outliers_zscore = (z_scores > 3)
print("\nOutliers per feature (Z-score > 3):")
for col in df_analysis.columns:
count = outliers_zscore[col].sum()
percentage = (count / len(df_analysis)) * 100
print(f"{col:30s}: {count:4d} outliers ({percentage:5.2f}%)")
print(f"\nTotal outlier data points: {outliers_zscore.sum().sum()}")
rows_with_outliers = outliers_zscore.any(axis=1).sum()
print(f"Rows containing at least one outlier: {rows_with_outliers} ({rows_with_outliers/len(df_analysis)*100:.2f}%)")
# 3. IQR METHOD (Interquartile Range)
print("\n" + "="*70)
print("3. IQR METHOD (beyond 1.5*IQR)")
print("="*70)
outliers_iqr = pd.DataFrame(index=df_analysis.index)
outliers_count_iqr = {}
iqr_bounds = {}
for col in df_analysis.columns:
Q1 = df_analysis[col].quantile(0.25)
Q3 = df_analysis[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr[col] = (df_analysis[col] < lower_bound) | (df_analysis[col] > upper_bound)
outliers_count_iqr[col] = outliers_iqr[col].sum()
iqr_bounds[col] = (lower_bound, upper_bound)
print(f"{col:30s}: {outliers_count_iqr[col]:4d} outliers ({outliers_count_iqr[col]/len(df_analysis)*100:5.2f}%) "
f"[{lower_bound:8.2f}, {upper_bound:8.2f}]")
rows_with_outliers_iqr = outliers_iqr.any(axis=1).sum()
print(f"\nRows containing at least one outlier: {rows_with_outliers_iqr} ({rows_with_outliers_iqr/len(df_analysis)*100:.2f}%)")
# 4. VISUALIZATION - BOXPLOTS
print("\n" + "="*70)
print("4. BOXPLOT VISUALIZATION")
print("="*70)
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.ravel()
for idx, col in enumerate(df_analysis.columns):
ax = axes[idx]
bp = ax.boxplot(df_analysis[col], vert=True, patch_artist=True,
boxprops=dict(facecolor='lightblue', alpha=0.7),
medianprops=dict(color='red', linewidth=2),
whiskerprops=dict(color='blue', linewidth=1.5),
capprops=dict(color='blue', linewidth=1.5),
flierprops=dict(marker='o', markerfacecolor='red', markersize=6, alpha=0.5))
ax.set_title(f'{col}\n({outliers_count_iqr[col]} outliers)', fontsize=10, fontweight='bold')
ax.set_ylabel('Value')
ax.grid(axis='y', alpha=0.3)
plt.suptitle('Outlier Detection - Boxplots for All Features', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()
# 5. HISTOGRAM + DISTRIBUTION
print("\n" + "="*70)
print("5. DISTRIBUTION ANALYSIS")
print("="*70)
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.ravel()
for idx, col in enumerate(df_analysis.columns):
ax = axes[idx]
# Histogram
ax.hist(df_analysis[col], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
# Get bounds
lower_bound, upper_bound = iqr_bounds[col]
ax.axvline(lower_bound, color='red', linestyle='--', linewidth=2, label=f'Lower: {lower_bound:.2f}')
ax.axvline(upper_bound, color='red', linestyle='--', linewidth=2, label=f'Upper: {upper_bound:.2f}')
ax.axvline(df_analysis[col].median(), color='green', linestyle='-', linewidth=2, label=f'Median: {df_analysis[col].median():.2f}')
ax.set_title(f'{col}', fontsize=10, fontweight='bold')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.legend(fontsize=7)
ax.grid(alpha=0.3)
plt.suptitle('Distribution Analysis with Outlier Boundaries', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()
# 6. SCATTER PLOTS - Outliers vs Strength
print("\n" + "="*70)
print("6. FEATURE vs STRENGTH (identifying influential outliers)")
print("="*70)
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.ravel()
for idx, col in enumerate(df_analysis.columns):
if col == 'Strength':
axes[idx].axis('off')
continue
ax = axes[idx]
# Regular points
mask_normal = ~outliers_iqr[col]
ax.scatter(df_analysis.loc[mask_normal, col],
df_analysis.loc[mask_normal, 'Strength'],
alpha=0.5, s=20, c='blue', label='Normal')
# Outlier points
mask_outlier = outliers_iqr[col]
if mask_outlier.sum() > 0:
ax.scatter(df_analysis.loc[mask_outlier, col],
df_analysis.loc[mask_outlier, 'Strength'],
alpha=0.7, s=50, c='red', marker='x', label='Outlier')
ax.set_xlabel(col, fontsize=9)
ax.set_ylabel('Strength', fontsize=9)
ax.set_title(f'{col} vs Strength', fontsize=10, fontweight='bold')
ax.legend(fontsize=7)
ax.grid(alpha=0.3)
plt.suptitle('Features vs Target - Highlighting Outliers', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()
# 7. MULTIVARIATE OUTLIERS - Isolation Forest
print("\n" + "="*70)
print("7. MULTIVARIATE OUTLIER DETECTION - Isolation Forest")
print("="*70)
# Isolation Forest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outliers_iso = iso_forest.fit_predict(df_analysis)
outliers_iso_mask = outliers_iso == -1
print(f"Outliers detected by Isolation Forest: {outliers_iso_mask.sum()} ({outliers_iso_mask.sum()/len(df_analysis)*100:.2f}%)")
# Visualize anomaly scores
anomaly_scores = iso_forest.score_samples(df_analysis)
plt.figure(figsize=(12, 6))
plt.hist(anomaly_scores, bins=50, alpha=0.7, color='purple', edgecolor='black')
plt.axvline(np.percentile(anomaly_scores, 10), color='red', linestyle='--', linewidth=2, label='10th percentile (outlier threshold)')
plt.xlabel('Anomaly Score', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Isolation Forest - Anomaly Score Distribution', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(alpha=0.3)
plt.show()
# 8. SUMMARY TABLE
print("\n" + "="*70)
print("8. OUTLIER DETECTION SUMMARY")
print("="*70)
summary_data = {
'Method': ['Z-Score (|z| > 3)', 'IQR (1.5*IQR)', 'Isolation Forest'],
'Outlier Rows': [rows_with_outliers, rows_with_outliers_iqr, outliers_iso_mask.sum()],
'Percentage': [
f"{rows_with_outliers/len(df_analysis)*100:.2f}%",
f"{rows_with_outliers_iqr/len(df_analysis)*100:.2f}%",
f"{outliers_iso_mask.sum()/len(df_analysis)*100:.2f}%"
]
}
summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
# 9. CREATE DATASETS WITH/WITHOUT OUTLIERS
print("\n" + "="*70)
print("9. CREATING CLEANED DATASETS")
print("="*70)
# Dataset without IQR outliers
df_no_outliers_iqr = df_analysis[~outliers_iqr.any(axis=1)].copy()
print(f"Original dataset: {len(df_analysis)} rows")
print(f"Dataset without IQR outliers: {len(df_no_outliers_iqr)} rows (removed {len(df_analysis) - len(df_no_outliers_iqr)} rows)")
# Dataset without Isolation Forest outliers
df_no_outliers_iso = df_analysis[~outliers_iso_mask].copy()
print(f"Dataset without Isolation Forest outliers: {len(df_no_outliers_iso)} rows (removed {len(df_analysis) - len(df_no_outliers_iso)} rows)")
# 10. RECOMMENDATION
print("\n" + "="*70)
print("10. RECOMMENDATIONS")
print("="*70)
print("""
Based on the outlier analysis:
1. IQR Method: Most sensitive - detects feature-wise outliers
2. Z-Score: Conservative - only extreme statistical outliers
3. Isolation Forest: Detects multivariate outliers (anomalies in feature combinations)
NEXT STEPS:
✓ Created df_no_outliers_iqr (without IQR outliers)
✓ Created df_no_outliers_iso (without Isolation Forest outliers)
RECOMMENDATIONS:
- Test models on BOTH datasets (with and without outliers)
- Tree-based models (Random Forest, Gradient Boosting) handle outliers well
- Linear models are sensitive to outliers - consider using cleaned data
- Consider capping extreme values instead of removing them
""")
======================================================================
OUTLIER ANALYSIS - Concrete Dataset
======================================================================
======================================================================
1. STATISTICAL SUMMARY
======================================================================
CementComponent BlastFurnaceSlag FlyAshComponent WaterComponent \
count 5407.000000 5407.000000 5407.000000 5407.000000
mean 299.030418 58.251910 31.895506 185.117472
std 105.107863 82.576886 54.478730 18.346339
min 102.000000 0.000000 0.000000 121.800000
25% 213.700000 0.000000 0.000000 175.500000
50% 297.200000 0.000000 0.000000 188.448466
75% 375.000000 121.000000 79.000000 192.000000
max 540.000000 359.400000 200.100000 247.000000
SuperplasticizerComponent CoarseAggregateComponent \
count 5407.000000 5407.000000
mean 4.100082 992.359368
std 5.664028 76.453119
min 0.000000 801.000000
25% 0.000000 940.000000
50% 0.000000 978.000000
75% 8.000000 1047.000000
max 32.200000 1145.000000
FineAggregateComponent AgeInDays Strength
count 5407.000000 5407.000000 5407.000000
mean 771.266154 52.027313 35.452071
std 78.081840 69.179240 16.401896
min 594.000000 1.000000 2.330000
25% 734.300000 7.000000 23.640000
50% 781.200000 28.000000 33.950000
75% 821.000000 56.000000 45.850000
max 992.600000 365.000000 82.600000
======================================================================
2. Z-SCORE METHOD (|z| > 3)
======================================================================
Outliers per feature (Z-score > 3):
CementComponent : 0 outliers ( 0.00%)
BlastFurnaceSlag : 20 outliers ( 0.37%)
FlyAshComponent : 5 outliers ( 0.09%)
WaterComponent : 34 outliers ( 0.63%)
SuperplasticizerComponent : 69 outliers ( 1.28%)
CoarseAggregateComponent : 0 outliers ( 0.00%)
FineAggregateComponent : 0 outliers ( 0.00%)
AgeInDays : 193 outliers ( 3.57%)
Strength : 0 outliers ( 0.00%)
Total outlier data points: 321
Rows containing at least one outlier: 307 (5.68%)
======================================================================
3. IQR METHOD (beyond 1.5*IQR)
======================================================================
CementComponent : 0 outliers ( 0.00%) [ -28.25, 616.95]
BlastFurnaceSlag : 37 outliers ( 0.68%) [ -181.50, 302.50]
FlyAshComponent : 5 outliers ( 0.09%) [ -118.50, 197.50]
WaterComponent : 473 outliers ( 8.75%) [ 150.75, 216.75]
SuperplasticizerComponent : 72 outliers ( 1.33%) [ -12.00, 20.00]
CoarseAggregateComponent : 0 outliers ( 0.00%) [ 779.50, 1207.50]
FineAggregateComponent : 146 outliers ( 2.70%) [ 604.25, 951.05]
AgeInDays : 471 outliers ( 8.71%) [ -66.50, 129.50]
Strength : 33 outliers ( 0.61%) [ -9.67, 79.16]
Rows containing at least one outlier: 912 (16.87%)
======================================================================
4. BOXPLOT VISUALIZATION
======================================================================
====================================================================== 5. DISTRIBUTION ANALYSIS ======================================================================
====================================================================== 6. FEATURE vs STRENGTH (identifying influential outliers) ======================================================================
====================================================================== 7. MULTIVARIATE OUTLIER DETECTION - Isolation Forest ====================================================================== Outliers detected by Isolation Forest: 541 (10.01%)
======================================================================
8. OUTLIER DETECTION SUMMARY
======================================================================
Method Outlier Rows Percentage
Z-Score (|z| > 3) 307 5.68%
IQR (1.5*IQR) 912 16.87%
Isolation Forest 541 10.01%
======================================================================
9. CREATING CLEANED DATASETS
======================================================================
Original dataset: 5407 rows
Dataset without IQR outliers: 4495 rows (removed 912 rows)
Dataset without Isolation Forest outliers: 4866 rows (removed 541 rows)
======================================================================
10. RECOMMENDATIONS
======================================================================
Based on the outlier analysis:
1. IQR Method: Most sensitive - detects feature-wise outliers
2. Z-Score: Conservative - only extreme statistical outliers
3. Isolation Forest: Detects multivariate outliers (anomalies in feature combinations)
NEXT STEPS:
✓ Created df_no_outliers_iqr (without IQR outliers)
✓ Created df_no_outliers_iso (without Isolation Forest outliers)
RECOMMENDATIONS:
- Test models on BOTH datasets (with and without outliers)
- Tree-based models (Random Forest, Gradient Boosting) handle outliers well
- Linear models are sensitive to outliers - consider using cleaned data
- Consider capping extreme values instead of removing them
In [24]:
# check distributions before and after imputation
for feature in df.columns:
sns.kdeplot(df[feature], label='Original', color='blue')
sns.kdeplot(df_imputed_knn[feature], label='KNN Imputed', color='orange')
plt.title(f'Distribution of {feature} before and after KNN Imputation')
plt.legend()
plt.show()
Feature engineering:¶
In [45]:
# Feature Engineering - Create new features for concrete strength prediction
import numpy as np
import pandas as pd
# Start with the imputed dataset
df_engineered = df_imputed_knn.copy()
print("Creating engineered features...")
print(f"Original features: {df_engineered.shape[1]}")
# 1. RAPPORTI TRA COMPONENTI (cruciali per la chimica del calcestruzzo)
# Water-to-Cement ratio (W/C) - uno dei fattori più importanti per la resistenza
df_engineered['Water_Cement_Ratio'] = df_engineered['WaterComponent'] / (df_engineered['CementComponent'] + 1e-10)
# Total Binder (materiali cementizi totali)
df_engineered['Total_Binder'] = (df_engineered['CementComponent'] +
df_engineered['BlastFurnaceSlag'] +
df_engineered['FlyAshComponent'])
# Water-to-Binder ratio (W/B)
df_engineered['Water_Binder_Ratio'] = df_engineered['WaterComponent'] / (df_engineered['Total_Binder'] + 1e-10)
# Fine-to-Coarse Aggregate ratio
df_engineered['Fine_Coarse_Ratio'] = df_engineered['FineAggregateComponent'] / (df_engineered['CoarseAggregateComponent'] + 1e-10)
# Superplasticizer-to-Cement ratio
df_engineered['Plasticizer_Cement_Ratio'] = df_engineered['SuperplasticizerComponent'] / (df_engineered['CementComponent'] + 1e-10)
# Supplementary Cementitious Materials (SCM) to Cement ratio
df_engineered['SCM_Cement_Ratio'] = (df_engineered['BlastFurnaceSlag'] + df_engineered['FlyAshComponent']) / (df_engineered['CementComponent'] + 1e-10)
# 2. SOMME E TOTALI
# Total Aggregate
df_engineered['Total_Aggregate'] = df_engineered['CoarseAggregateComponent'] + df_engineered['FineAggregateComponent']
# Total Weight (peso totale mix)
df_engineered['Total_Weight'] = (df_engineered['CementComponent'] +
df_engineered['BlastFurnaceSlag'] +
df_engineered['FlyAshComponent'] +
df_engineered['WaterComponent'] +
df_engineered['SuperplasticizerComponent'] +
df_engineered['CoarseAggregateComponent'] +
df_engineered['FineAggregateComponent'])
# Paste Volume (rapporto pasta cementizia vs aggregati)
df_engineered['Paste_Volume'] = (df_engineered['Total_Binder'] + df_engineered['WaterComponent']) / (df_engineered['Total_Weight'] + 1e-10)
# 3. PERCENTUALI (composizione del mix)
df_engineered['Cement_Percentage'] = df_engineered['CementComponent'] / (df_engineered['Total_Weight'] + 1e-10) * 100
df_engineered['Water_Percentage'] = df_engineered['WaterComponent'] / (df_engineered['Total_Weight'] + 1e-10) * 100
df_engineered['Aggregate_Percentage'] = df_engineered['Total_Aggregate'] / (df_engineered['Total_Weight'] + 1e-10) * 100
df_engineered['Binder_Percentage'] = df_engineered['Total_Binder'] / (df_engineered['Total_Weight'] + 1e-10) * 100
# 4. FEATURE TEMPORALI (età del calcestruzzo)
# La resistenza del calcestruzzo cresce in modo logaritmico nel tempo
df_engineered['Age_Log'] = np.log1p(df_engineered['AgeInDays']) # log(1 + x) per gestire 0
df_engineered['Age_Sqrt'] = np.sqrt(df_engineered['AgeInDays'])
df_engineered['Age_Squared'] = df_engineered['AgeInDays'] ** 2
df_engineered['Age_Cubed'] = df_engineered['AgeInDays'] ** 3
# Binning dell'età (categorizzazione)
df_engineered['Age_Young'] = (df_engineered['AgeInDays'] <= 7).astype(int) # 0-7 giorni
df_engineered['Age_Medium'] = ((df_engineered['AgeInDays'] > 7) & (df_engineered['AgeInDays'] <= 28)).astype(int) # 7-28 giorni
df_engineered['Age_Old'] = (df_engineered['AgeInDays'] > 28).astype(int) # 28+ giorni
# 5. FEATURE BINARIE (presenza/assenza di componenti)
df_engineered['Has_FlyAsh'] = (df_engineered['FlyAshComponent'] > 0).astype(int)
df_engineered['Has_BlastFurnace'] = (df_engineered['BlastFurnaceSlag'] > 0).astype(int)
df_engineered['Has_Plasticizer'] = (df_engineered['SuperplasticizerComponent'] > 0).astype(int)
# 6. INTERAZIONI TRA COMPONENTI CHIAVE
# Cement x Age (il cemento guadagna resistenza nel tempo)
df_engineered['Cement_Age_Interaction'] = df_engineered['CementComponent'] * df_engineered['Age_Log']
# Water x Age
df_engineered['Water_Age_Interaction'] = df_engineered['WaterComponent'] * df_engineered['Age_Log']
# Binder x Age
df_engineered['Binder_Age_Interaction'] = df_engineered['Total_Binder'] * df_engineered['Age_Log']
# Water/Cement x Age
df_engineered['WC_Age_Interaction'] = df_engineered['Water_Cement_Ratio'] * df_engineered['Age_Log']
# 7. FEATURE QUADRATICHE (per catturare relazioni non lineari)
df_engineered['Cement_Squared'] = df_engineered['CementComponent'] ** 2
df_engineered['Water_Squared'] = df_engineered['WaterComponent'] ** 2
df_engineered['WC_Ratio_Squared'] = df_engineered['Water_Cement_Ratio'] ** 2
# 8. DENSITÀ E COMPATTEZZA
# Aggregate density
df_engineered['Aggregate_Density'] = df_engineered['Total_Aggregate'] / (df_engineered['Total_Weight'] + 1e-10)
# Binder efficiency (binder per unit of water)
df_engineered['Binder_Efficiency'] = df_engineered['Total_Binder'] / (df_engineered['WaterComponent'] + 1e-10)
# 9. COMBINAZIONI DI MATERIALI SUPPLEMENTARI
# Total SCM (Supplementary Cementitious Materials)
df_engineered['Total_SCM'] = df_engineered['BlastFurnaceSlag'] + df_engineered['FlyAshComponent']
df_engineered['SCM_Percentage'] = df_engineered['Total_SCM'] / (df_engineered['Total_Weight'] + 1e-10) * 100
# Fly Ash to Blast Furnace ratio
df_engineered['FlyAsh_BlastFurnace_Ratio'] = df_engineered['FlyAshComponent'] / (df_engineered['BlastFurnaceSlag'] + 1e-10)
print(f"Engineered features: {df_engineered.shape[1]}")
print(f"New features created: {df_engineered.shape[1] - df_imputed_knn.shape[1]}")
print("\nNew features list:")
new_features = [col for col in df_engineered.columns if col not in df_imputed_knn.columns]
for i, feat in enumerate(new_features, 1):
print(f"{i}. {feat}")
# Visualize correlation of new features with Strength
print("\n" + "="*70)
print("CORRELATION OF NEW FEATURES WITH STRENGTH (Top 15)")
print("="*70)
new_features_corr = df_engineered[new_features + ['Strength']].corr()['Strength'].drop('Strength').sort_values(ascending=False)
print(new_features_corr.head(15))
# Visualize
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 8))
top_features = new_features_corr.head(20)
colors = ['green' if x > 0 else 'red' for x in top_features.values]
plt.barh(range(len(top_features)), top_features.values, color=colors, alpha=0.7)
plt.yticks(range(len(top_features)), top_features.index)
plt.xlabel('Correlation with Strength', fontsize=12, fontweight='bold')
plt.title('Top 20 New Features - Correlation with Concrete Strength', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
Creating engineered features... Original features: 9 Engineered features: 44 New features created: 35 New features list: 1. Water_Cement_Ratio 2. Total_Binder 3. Water_Binder_Ratio 4. Fine_Coarse_Ratio 5. Plasticizer_Cement_Ratio 6. SCM_Cement_Ratio 7. Total_Aggregate 8. Total_Weight 9. Paste_Volume 10. Cement_Percentage 11. Water_Percentage 12. Aggregate_Percentage 13. Binder_Percentage 14. Age_Log 15. Age_Sqrt 16. Age_Squared 17. Age_Cubed 18. Age_Young 19. Age_Medium 20. Age_Old 21. Has_FlyAsh 22. Has_BlastFurnace 23. Has_Plasticizer 24. Cement_Age_Interaction 25. Water_Age_Interaction 26. Binder_Age_Interaction 27. WC_Age_Interaction 28. Cement_Squared 29. Water_Squared 30. WC_Ratio_Squared 31. Aggregate_Density 32. Binder_Efficiency 33. Total_SCM 34. SCM_Percentage 35. FlyAsh_BlastFurnace_Ratio ====================================================================== CORRELATION OF NEW FEATURES WITH STRENGTH (Top 15) ====================================================================== Binder_Age_Interaction 0.582406 Age_Log 0.553642 Water_Age_Interaction 0.474287 Cement_Age_Interaction 0.473223 Age_Sqrt 0.458170 Age_Old 0.430362 Binder_Efficiency 0.253757 Binder_Percentage 0.238454 Total_Binder 0.237595 WC_Age_Interaction 0.230582 Paste_Volume 0.214263 Age_Squared 0.180880 Has_Plasticizer 0.174939 Cement_Squared 0.159666 Cement_Percentage 0.153418 Name: Strength, dtype: float64
In [ ]:
# redo the correlation matrix
#plt.figure(figsize=(10, 8))
#sns.heatmap(df_engineered.corr(), annot=True, fmt=".2f", cmap='coolwarm')
#plt.title("Correlation Matrix after KNN Imputation")
Out[ ]:
Text(0.5, 1.0, 'Correlation Matrix after KNN Imputation')
In [52]:
import xgboost as xgb
In [53]:
# Create different predictors using sklearn and compare results on MSE and R2 score and use SHAP to explain models
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import shap
import pandas as pd
import matplotlib.pyplot as plt
# Try to import XGBoost
try:
import xgboost as xgb
XGBOOST_AVAILABLE = True
print("✓ XGBoost is available")
except ImportError:
XGBOOST_AVAILABLE = False
print("⚠ XGBoost not installed. Install with: pip install xgboost")
# Prepare data
X = df_imputed_knn.drop(columns=["Strength"])
y = df_imputed_knn["Strength"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define models to compare
models = {
'Decision Tree': DecisionTreeRegressor(random_state=42),
'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
'Linear Regression': LinearRegression(),
'Ridge': Ridge(random_state=42),
'Lasso': Lasso(random_state=42),
'ElasticNet': ElasticNet(random_state=42),
'SVR': SVR(),
'KNN': KNeighborsRegressor(n_neighbors=5)
}
# Add XGBoost if available
if XGBOOST_AVAILABLE:
models['XGBoost'] = xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
print("✓ XGBoost added to model comparison")
# Store results
results = []
# Train and evaluate each model
for name, model in models.items():
print(f"\n{'='*50}")
print(f"Training {name}...")
print(f"{'='*50}")
# Train model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Store results
results.append({
'Model': name,
'MSE': mse,
'RMSE': rmse,
'MAE': mae,
'R2 Score': r2
})
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"R^2 Score: {r2:.4f}")
# Create comparison dataframe
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('R2 Score', ascending=False)
print("\n" + "="*70)
print("MODEL COMPARISON - Sorted by R2 Score")
print("="*70)
print(results_df.to_string(index=False))
# Visualize comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# R2 Score comparison
results_df.plot(x='Model', y='R2 Score', kind='barh', ax=axes[0, 0], legend=False, color='skyblue')
axes[0, 0].set_title('R² Score Comparison (Higher is Better)', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('R² Score')
axes[0, 0].grid(axis='x', alpha=0.3)
# RMSE comparison
results_df.plot(x='Model', y='RMSE', kind='barh', ax=axes[0, 1], legend=False, color='salmon')
axes[0, 1].set_title('RMSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('RMSE')
axes[0, 1].grid(axis='x', alpha=0.3)
# MSE comparison
results_df.plot(x='Model', y='MSE', kind='barh', ax=axes[1, 0], legend=False, color='lightgreen')
axes[1, 0].set_title('MSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('MSE')
axes[1, 0].grid(axis='x', alpha=0.3)
# MAE comparison
results_df.plot(x='Model', y='MAE', kind='barh', ax=axes[1, 1], legend=False, color='plum')
axes[1, 1].set_title('MAE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('MAE')
axes[1, 1].grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
✓ XGBoost is available
✓ XGBoost added to model comparison
==================================================
Training Decision Tree...
==================================================
Mean Squared Error: 231.3182
Root Mean Squared Error: 15.2091
Mean Absolute Error: 11.4152
R^2 Score: 0.1499
==================================================
Training Random Forest...
==================================================
Mean Squared Error: 156.2972
Root Mean Squared Error: 12.5019
Mean Absolute Error: 9.7730
R^2 Score: 0.4256
==================================================
Training Gradient Boosting...
==================================================
Mean Squared Error: 140.5207
Root Mean Squared Error: 11.8541
Mean Absolute Error: 9.2800
R^2 Score: 0.4836
==================================================
Training AdaBoost...
==================================================
Mean Squared Error: 163.9486
Root Mean Squared Error: 12.8042
Mean Absolute Error: 10.2720
R^2 Score: 0.3975
==================================================
Training Linear Regression...
==================================================
Mean Squared Error: 209.1183
Root Mean Squared Error: 14.4609
Mean Absolute Error: 11.5011
R^2 Score: 0.2315
==================================================
Training Ridge...
==================================================
Mean Squared Error: 209.1183
Root Mean Squared Error: 14.4609
Mean Absolute Error: 11.5011
R^2 Score: 0.2315
==================================================
Training Lasso...
==================================================
Mean Squared Error: 209.2316
Root Mean Squared Error: 14.4648
Mean Absolute Error: 11.5141
R^2 Score: 0.2311
==================================================
Training ElasticNet...
==================================================
Mean Squared Error: 209.1796
Root Mean Squared Error: 14.4630
Mean Absolute Error: 11.5089
R^2 Score: 0.2313
==================================================
Training SVR...
==================================================
Mean Squared Error: 222.5505
Root Mean Squared Error: 14.9181
Mean Absolute Error: 11.9723
R^2 Score: 0.1822
==================================================
Training KNN...
==================================================
Mean Squared Error: 186.1852
Root Mean Squared Error: 13.6450
Mean Absolute Error: 10.6636
R^2 Score: 0.3158
==================================================
Training XGBoost...
==================================================
Mean Squared Error: 159.3072
Root Mean Squared Error: 12.6217
Mean Absolute Error: 9.8598
R^2 Score: 0.4146
======================================================================
MODEL COMPARISON - Sorted by R2 Score
======================================================================
Model MSE RMSE MAE R2 Score
Gradient Boosting 140.520654 11.854141 9.279957 0.483606
Random Forest 156.297150 12.501886 9.773029 0.425629
XGBoost 159.307239 12.621697 9.859786 0.414568
AdaBoost 163.948580 12.804241 10.271958 0.397511
KNN 186.185244 13.644971 10.663616 0.315795
Ridge 209.118292 14.460923 11.501069 0.231519
Linear Regression 209.118292 14.460923 11.501068 0.231519
ElasticNet 209.179574 14.463042 11.508877 0.231294
Lasso 209.231627 14.464841 11.514120 0.231102
SVR 222.550541 14.918128 11.972333 0.182157
Decision Tree 231.318230 15.209150 11.415223 0.149937
Hyperparameter tuning¶
In [50]:
# HYPERPARAMETER TUNING - Gradient Boosting Optimization
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import time
print("="*70)
print("GRADIENT BOOSTING - HYPERPARAMETER TUNING")
print("="*70)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Features: {X_train.shape[1]}")
# ============================================================================
# STEP 1: Baseline Model (default parameters)
# ============================================================================
print("\n" + "="*70)
print("STEP 1: BASELINE MODEL (default parameters)")
print("="*70)
baseline_model = GradientBoostingRegressor(random_state=42)
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)
baseline_mse = mean_squared_error(y_test, y_pred_baseline)
baseline_rmse = np.sqrt(baseline_mse)
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)
baseline_r2 = r2_score(y_test, y_pred_baseline)
print(f"\nBaseline Results:")
print(f" MSE: {baseline_mse:.4f}")
print(f" RMSE: {baseline_rmse:.4f}")
print(f" MAE: {baseline_mae:.4f}")
print(f" R²: {baseline_r2:.4f}")
# ============================================================================
# STEP 2: Randomized Search (fast exploration)
# ============================================================================
print("\n" + "="*70)
print("STEP 2: RANDOMIZED SEARCH (exploring parameter space)")
print("="*70)
# Define parameter distribution for random search
param_distributions = {
'n_estimators': [100, 200, 300, 500, 800, 1000],
'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
'max_depth': [3, 4, 5, 6, 7, 8, 10],
'min_samples_split': [2, 5, 10, 20],
'min_samples_leaf': [1, 2, 4, 8],
'subsample': [0.7, 0.8, 0.9, 1.0],
'max_features': ['sqrt', 'log2', None]
}
print("\nParameter space:")
for param, values in param_distributions.items():
print(f" {param:20s}: {values}")
print(f"\nRunning RandomizedSearchCV with 50 iterations and 5-fold CV...")
print("This may take several minutes...\n")
start_time = time.time()
random_search = RandomizedSearchCV(
estimator=GradientBoostingRegressor(random_state=42),
param_distributions=param_distributions,
n_iter=50, # Try 50 random combinations
cv=5, # 5-fold cross-validation
scoring='r2',
n_jobs=-1, # Use all CPU cores
random_state=42,
verbose=1
)
random_search.fit(X_train, y_train)
random_search_time = time.time() - start_time
print(f"\n✓ RandomizedSearchCV completed in {random_search_time:.2f} seconds")
print(f"\nBest parameters found:")
for param, value in random_search.best_params_.items():
print(f" {param:20s}: {value}")
print(f"\nBest CV R² score: {random_search.best_score_:.4f}")
# Evaluate on test set
y_pred_random = random_search.predict(X_test)
random_mse = mean_squared_error(y_test, y_pred_random)
random_rmse = np.sqrt(random_mse)
random_mae = mean_absolute_error(y_test, y_pred_random)
random_r2 = r2_score(y_test, y_pred_random)
print(f"\nTest set results (RandomizedSearch best model):")
print(f" MSE: {random_mse:.4f}")
print(f" RMSE: {random_rmse:.4f}")
print(f" MAE: {random_mae:.4f}")
print(f" R²: {random_r2:.4f}")
# ============================================================================
# STEP 3: Grid Search (fine-tuning around best parameters)
# ============================================================================
print("\n" + "="*70)
print("STEP 3: GRID SEARCH (fine-tuning best parameters)")
print("="*70)
# Create a narrow grid around the best parameters
best_params = random_search.best_params_
# Define narrow ranges around best values
grid_param = {
'n_estimators': [max(100, best_params['n_estimators'] - 100),
best_params['n_estimators'],
best_params['n_estimators'] + 100],
'learning_rate': [max(0.01, best_params['learning_rate'] - 0.02),
best_params['learning_rate'],
min(0.3, best_params['learning_rate'] + 0.02)],
'max_depth': [max(3, best_params['max_depth'] - 1),
best_params['max_depth'],
best_params['max_depth'] + 1],
'min_samples_split': [best_params['min_samples_split']],
'min_samples_leaf': [best_params['min_samples_leaf']],
'subsample': [best_params['subsample']],
'max_features': [best_params['max_features']]
}
print("\nFine-tuning grid:")
for param, values in grid_param.items():
print(f" {param:20s}: {values}")
print(f"\nRunning GridSearchCV with 5-fold CV...")
print("This may take a few minutes...\n")
start_time = time.time()
grid_search = GridSearchCV(
estimator=GradientBoostingRegressor(random_state=42),
param_grid=grid_param,
cv=5,
scoring='r2',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
grid_search_time = time.time() - start_time
print(f"\n✓ GridSearchCV completed in {grid_search_time:.2f} seconds")
print(f"\nFinal best parameters:")
for param, value in grid_search.best_params_.items():
print(f" {param:20s}: {value}")
print(f"\nBest CV R² score: {grid_search.best_score_:.4f}")
# Evaluate on test set
y_pred_grid = grid_search.predict(X_test)
grid_mse = mean_squared_error(y_test, y_pred_grid)
grid_rmse = np.sqrt(grid_mse)
grid_mae = mean_absolute_error(y_test, y_pred_grid)
grid_r2 = r2_score(y_test, y_pred_grid)
print(f"\nTest set results (GridSearch best model):")
print(f" MSE: {grid_mse:.4f}")
print(f" RMSE: {grid_rmse:.4f}")
print(f" MAE: {grid_mae:.4f}")
print(f" R²: {grid_r2:.4f}")
# ============================================================================
# STEP 4: Comparison and Visualization
# ============================================================================
print("\n" + "="*70)
print("STEP 4: MODEL COMPARISON")
print("="*70)
comparison_data = {
'Model': ['Baseline (default)', 'RandomizedSearch', 'GridSearch (final)'],
'MSE': [baseline_mse, random_mse, grid_mse],
'RMSE': [baseline_rmse, random_rmse, grid_rmse],
'MAE': [baseline_mae, random_mae, grid_mae],
'R² Score': [baseline_r2, random_r2, grid_r2]
}
comparison_df = pd.DataFrame(comparison_data)
print("\n" + comparison_df.to_string(index=False))
# Calculate improvements
r2_improvement = ((grid_r2 - baseline_r2) / baseline_r2) * 100
rmse_improvement = ((baseline_rmse - grid_rmse) / baseline_rmse) * 100
print(f"\n📈 IMPROVEMENTS:")
print(f" R² improvement: {r2_improvement:+.2f}%")
print(f" RMSE improvement: {rmse_improvement:+.2f}%")
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# R² comparison
axes[0, 0].bar(comparison_df['Model'], comparison_df['R² Score'], color=['gray', 'skyblue', 'green'], alpha=0.7)
axes[0, 0].set_title('R² Score Comparison', fontsize=12, fontweight='bold')
axes[0, 0].set_ylabel('R² Score')
axes[0, 0].grid(axis='y', alpha=0.3)
axes[0, 0].set_ylim(0, 1)
for i, v in enumerate(comparison_df['R² Score']):
axes[0, 0].text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')
# RMSE comparison
axes[0, 1].bar(comparison_df['Model'], comparison_df['RMSE'], color=['gray', 'skyblue', 'green'], alpha=0.7)
axes[0, 1].set_title('RMSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('RMSE')
axes[0, 1].grid(axis='y', alpha=0.3)
for i, v in enumerate(comparison_df['RMSE']):
axes[0, 1].text(i, v + 0.2, f'{v:.2f}', ha='center', fontweight='bold')
# Actual vs Predicted (GridSearch model)
axes[1, 0].scatter(y_test, y_pred_grid, alpha=0.5, s=30, color='green')
axes[1, 0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect prediction')
axes[1, 0].set_xlabel('Actual Strength', fontsize=11)
axes[1, 0].set_ylabel('Predicted Strength', fontsize=11)
axes[1, 0].set_title(f'Actual vs Predicted (Final Model)\nR² = {grid_r2:.4f}', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)
# Residuals plot
residuals = y_test - y_pred_grid
axes[1, 1].scatter(y_pred_grid, residuals, alpha=0.5, s=30, color='purple')
axes[1, 1].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1, 1].set_xlabel('Predicted Strength', fontsize=11)
axes[1, 1].set_ylabel('Residuals', fontsize=11)
axes[1, 1].set_title('Residual Plot (Final Model)', fontsize=12, fontweight='bold')
axes[1, 1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
# ============================================================================
# STEP 5: Feature Importance (Final Model)
# ============================================================================
print("\n" + "="*70)
print("STEP 5: FEATURE IMPORTANCE (Final Model)")
print("="*70)
feature_importance = pd.DataFrame({
'Feature': X_train.columns,
'Importance': grid_search.best_estimator_.feature_importances_
}).sort_values('Importance', ascending=False)
print("\n" + feature_importance.to_string(index=False))
# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'], color='teal', alpha=0.7)
plt.xlabel('Importance', fontsize=12, fontweight='bold')
plt.title('Feature Importance - Optimized Gradient Boosting', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
# ============================================================================
# STEP 6: Save Best Model
# ============================================================================
print("\n" + "="*70)
print("STEP 6: BEST MODEL SUMMARY")
print("="*70)
best_model = grid_search.best_estimator_
print(f"\n✓ Best Gradient Boosting Model:")
print(f" - R² Score: {grid_r2:.4f}")
print(f" - RMSE: {grid_rmse:.4f}")
print(f" - MAE: {grid_mae:.4f}")
print(f"\n✓ Model saved as: best_model")
print(f"\nTo use this model for predictions:")
print(f" predictions = best_model.predict(new_data)")
print("\n" + "="*70)
print("HYPERPARAMETER TUNING COMPLETE!")
print("="*70)
======================================================================
GRADIENT BOOSTING - HYPERPARAMETER TUNING
======================================================================
Training set: 4325 samples
Test set: 1082 samples
Features: 43
======================================================================
STEP 1: BASELINE MODEL (default parameters)
======================================================================
Baseline Results:
MSE: 140.6489
RMSE: 11.8595
MAE: 9.2451
R²: 0.4831
======================================================================
STEP 2: RANDOMIZED SEARCH (exploring parameter space)
======================================================================
Parameter space:
n_estimators : [100, 200, 300, 500, 800, 1000]
learning_rate : [0.01, 0.05, 0.1, 0.15, 0.2]
max_depth : [3, 4, 5, 6, 7, 8, 10]
min_samples_split : [2, 5, 10, 20]
min_samples_leaf : [1, 2, 4, 8]
subsample : [0.7, 0.8, 0.9, 1.0]
max_features : ['sqrt', 'log2', None]
Running RandomizedSearchCV with 50 iterations and 5-fold CV...
This may take several minutes...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
✓ RandomizedSearchCV completed in 211.88 seconds
Best parameters found:
subsample : 0.7
n_estimators : 300
min_samples_split : 10
min_samples_leaf : 8
max_features : log2
max_depth : 3
learning_rate : 0.05
Best CV R² score: 0.4393
Test set results (RandomizedSearch best model):
MSE: 140.1357
RMSE: 11.8379
MAE: 9.2414
R²: 0.4850
======================================================================
STEP 3: GRID SEARCH (fine-tuning best parameters)
======================================================================
Fine-tuning grid:
n_estimators : [200, 300, 400]
learning_rate : [0.030000000000000002, 0.05, 0.07]
max_depth : [3, 3, 4]
min_samples_split : [10]
min_samples_leaf : [8]
subsample : [0.7]
max_features : ['log2']
Running GridSearchCV with 5-fold CV...
This may take a few minutes...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
✓ GridSearchCV completed in 13.27 seconds
Final best parameters:
learning_rate : 0.05
max_depth : 3
max_features : log2
min_samples_leaf : 8
min_samples_split : 10
n_estimators : 200
subsample : 0.7
Best CV R² score: 0.4408
Test set results (GridSearch best model):
MSE: 139.6006
RMSE: 11.8153
MAE: 9.2221
R²: 0.4870
======================================================================
STEP 4: MODEL COMPARISON
======================================================================
Model MSE RMSE MAE R² Score
Baseline (default) 140.648870 11.859548 9.245056 0.483135
RandomizedSearch 140.135658 11.837891 9.241365 0.485021
GridSearch (final) 139.600631 11.815271 9.222091 0.486987
📈 IMPROVEMENTS:
R² improvement: +0.80%
RMSE improvement: +0.37%
======================================================================
STEP 5: FEATURE IMPORTANCE (Final Model)
======================================================================
Feature Importance
Age_Cubed 0.198850
Age_Sqrt 0.102132
AgeInDays 0.091048
Water_Age_Interaction 0.075758
Binder_Age_Interaction 0.057597
Age_Young 0.053072
Age_Squared 0.042254
Cement_Age_Interaction 0.030969
Water_Binder_Ratio 0.026563
Age_Log 0.021339
WC_Age_Interaction 0.020321
Cement_Percentage 0.018542
Aggregate_Density 0.016734
Cement_Squared 0.015845
SuperplasticizerComponent 0.015096
Water_Squared 0.013405
Water_Percentage 0.013304
Total_Binder 0.012382
Binder_Efficiency 0.012258
Aggregate_Percentage 0.011553
WaterComponent 0.011433
Age_Medium 0.010439
Has_Plasticizer 0.009954
Fine_Coarse_Ratio 0.009917
FineAggregateComponent 0.009440
CementComponent 0.009254
Total_Weight 0.008516
SCM_Percentage 0.007428
Plasticizer_Cement_Ratio 0.007323
WC_Ratio_Squared 0.007029
Age_Old 0.006670
CoarseAggregateComponent 0.006518
Binder_Percentage 0.006285
Total_SCM 0.006252
FlyAsh_BlastFurnace_Ratio 0.006083
Total_Aggregate 0.005914
Paste_Volume 0.005531
Water_Cement_Ratio 0.004984
BlastFurnaceSlag 0.003315
SCM_Cement_Ratio 0.003227
FlyAshComponent 0.002695
Has_BlastFurnace 0.001829
Has_FlyAsh 0.000940
====================================================================== STEP 6: BEST MODEL SUMMARY ====================================================================== ✓ Best Gradient Boosting Model: - R² Score: 0.4870 - RMSE: 11.8153 - MAE: 9.2221 ✓ Model saved as: best_model To use this model for predictions: predictions = best_model.predict(new_data) ====================================================================== HYPERPARAMETER TUNING COMPLETE! ======================================================================
In [54]:
# save best model for streamlit app
import joblib
joblib.dump(best_model, 'best_gradient_boosting_model.pkl')
Out[54]:
['best_gradient_boosting_model.pkl']
In [56]:
# Save the best model and data for Streamlit app
import pickle
import joblib
print("="*70)
print("SAVING BEST MODEL FOR STREAMLIT APP")
print("="*70)
# Retrain on full data for best performance
best_model.fit(X_train, y_train)
print(f"\nBest Model: Gradient Boosting Regressor Optimized")
print(f"R² Score: {results_df.iloc[0]['R2 Score']:.4f}")
# Save model
joblib.dump(best_model, 'best_concrete_model.pkl')
print(f"\n✓ Model saved as: best_concrete_model.pkl")
# Save feature names
feature_names = X_train.columns.tolist()
with open('feature_names.pkl', 'wb') as f:
pickle.dump(feature_names, f)
print(f"✓ Feature names saved as: feature_names.pkl")
# Save training data for SHAP (sample for speed)
X_train_sample = X_train.sample(n=min(100, len(X_train)), random_state=42)
joblib.dump(X_train_sample, 'X_train_sample.pkl')
print(f"✓ Training sample saved as: X_train_sample.pkl")
# Save statistics for input validation
feature_stats = df_imputed_knn.describe().to_dict()
with open('feature_stats.pkl', 'wb') as f:
pickle.dump(feature_stats, f)
print(f"✓ Feature statistics saved as: feature_stats.pkl")
print("\n" + "="*70)
print("FILES READY FOR STREAMLIT APP")
print("="*70)
print("\nSaved files:")
print(" 1. best_concrete_model.pkl - Trained model")
print(" 2. feature_names.pkl - Feature names")
print(" 3. X_train_sample.pkl - Training data for SHAP")
print(" 4. feature_stats.pkl - Feature statistics")
print("\nNext: Run the Streamlit app with: streamlit run app.")
====================================================================== SAVING BEST MODEL FOR STREAMLIT APP ====================================================================== Best Model: Gradient Boosting Regressor Optimized R² Score: 0.4836 ✓ Model saved as: best_concrete_model.pkl ✓ Feature names saved as: feature_names.pkl ✓ Training sample saved as: X_train_sample.pkl ✓ Feature statistics saved as: feature_stats.pkl ====================================================================== FILES READY FOR STREAMLIT APP ====================================================================== Saved files: 1. best_concrete_model.pkl - Trained model 2. feature_names.pkl - Feature names 3. X_train_sample.pkl - Training data for SHAP 4. feature_stats.pkl - Feature statistics Next: Run the Streamlit app with: streamlit run app.
In [58]:
%pip install nbconvert
Collecting nbconvert Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB) Requirement already satisfied: beautifulsoup4 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (4.14.2) Requirement already satisfied: bleach!=5.0.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from bleach[css]!=5.0.0->nbconvert) (6.3.0) Requirement already satisfied: defusedxml in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (0.7.1) Requirement already satisfied: jinja2>=3.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (3.1.6) Requirement already satisfied: jupyter-core>=4.7 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (5.9.1) Requirement already satisfied: jupyterlab-pygments in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (0.3.0) Requirement already satisfied: markupsafe>=2.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (3.0.3) Requirement already satisfied: mistune<4,>=2.0.3 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (3.1.4) Collecting nbclient>=0.5.0 (from nbconvert) Using cached nbclient-0.10.2-py3-none-any.whl.metadata (8.3 kB) Collecting nbformat>=5.7 (from nbconvert) Using cached nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB) Requirement already satisfied: packaging in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (25.0) Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (1.5.1) Requirement already satisfied: pygments>=2.4.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (2.19.2) Requirement already satisfied: traitlets>=5.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbconvert) (5.14.3) Requirement already satisfied: webencodings in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert) (0.5.1) Requirement already satisfied: tinycss2<1.5,>=1.1.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from bleach[css]!=5.0.0->nbconvert) (1.4.0) Requirement already satisfied: platformdirs>=2.5 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-core>=4.7->nbconvert) (4.5.0) Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbclient>=0.5.0->nbconvert) (8.6.3) Requirement already satisfied: fastjsonschema>=2.15 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbformat>=5.7->nbconvert) (2.21.2) Requirement already satisfied: jsonschema>=2.6 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from nbformat>=5.7->nbconvert) (4.23.0) Requirement already satisfied: soupsieve>1.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from beautifulsoup4->nbconvert) (2.8) Requirement already satisfied: typing-extensions>=4.0.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from beautifulsoup4->nbconvert) (4.15.0) Requirement already satisfied: attrs>=22.2.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (25.4.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (2025.9.1) Requirement already satisfied: referencing>=0.28.4 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (0.37.0) Requirement already satisfied: rpds-py>=0.7.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (0.28.0) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.9.0.post0) Requirement already satisfied: pyzmq>=23.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (27.1.0) Requirement already satisfied: tornado>=6.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.5.2) Requirement already satisfied: six>=1.5 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from python-dateutil>=2.8.2->jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (1.17.0) Using cached nbconvert-7.16.6-py3-none-any.whl (258 kB) Using cached nbclient-0.10.2-py3-none-any.whl (25 kB) Using cached nbformat-5.10.4-py3-none-any.whl (78 kB) Installing collected packages: nbformat, nbclient, nbconvert Successfully installed nbclient-0.10.2 nbconvert-7.16.6 nbformat-5.10.4 Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 25.0.1 -> 25.3 [notice] To update, run: python.exe -m pip install --upgrade pip
Shap¶
In [40]:
# SHAP Analysis for top 3 models
print("\n" + "="*70)
print("SHAP ANALYSIS - Explaining Model Predictions")
print("="*70)
# Get top 3 models based on R2 score
top_3_models = results_df.head(3)['Model'].tolist()
# Re-train top models and create SHAP explanations
for model_name in top_3_models:
print(f"\n{'='*50}")
print(f"SHAP Analysis for {model_name}")
print(f"{'='*50}")
# Get the model
model = models[model_name]
model.fit(X_train, y_train)
# Create SHAP explainer
try:
if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
# Tree-based models can use TreeExplainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
else:
# For other models, use KernelExplainer (slower but works for any model)
# Use a smaller background dataset for faster computation
background = shap.sample(X_train, 100)
explainer = shap.KernelExplainer(model.predict, background)
shap_values = explainer.shap_values(X_test[:100]) # Limit test samples for speed
X_test_sample = X_test[:100]
# Summary plot (bar)
plt.figure(figsize=(10, 6))
if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
else:
shap.summary_plot(shap_values, X_test_sample, plot_type="bar", show=False)
plt.title(f'{model_name} - Feature Importance (SHAP)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
# Summary plot (beeswarm)
plt.figure(figsize=(10, 6))
if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
shap.summary_plot(shap_values, X_test, show=False)
else:
shap.summary_plot(shap_values, X_test_sample, show=False)
plt.title(f'{model_name} - Feature Impact on Predictions (SHAP)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
# Waterfall plot for first prediction
plt.figure(figsize=(10, 6))
if model_name in ['Decision Tree', 'Random Forest', 'Gradient Boosting']:
shap.waterfall_plot(shap.Explanation(values=shap_values[0],
base_values=explainer.expected_value,
data=X_test.iloc[0],
feature_names=X_test.columns.tolist()),
show=False)
else:
shap.waterfall_plot(shap.Explanation(values=shap_values[0],
base_values=explainer.expected_value,
data=X_test_sample.iloc[0],
feature_names=X_test.columns.tolist()),
show=False)
plt.title(f'{model_name} - Single Prediction Explanation (First Test Sample)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
except Exception as e:
print(f"Error creating SHAP explanation for {model_name}: {str(e)}")
print("Skipping SHAP analysis for this model...")
continue
print("\n" + "="*70)
print("ANALYSIS COMPLETE")
print("="*70)
====================================================================== SHAP ANALYSIS - Explaining Model Predictions ====================================================================== ================================================== SHAP Analysis for Gradient Boosting ==================================================
================================================== SHAP Analysis for Random Forest ==================================================
================================================== SHAP Analysis for AdaBoost ==================================================
100%|██████████| 100/100 [00:03<00:00, 28.22it/s]
====================================================================== ANALYSIS COMPLETE ======================================================================
In [ ]:
import autogluon.features.
autogluon.eda.auto.target_analysis(train_data=df, label="Strength")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[20], line 2 1 import autogluon ----> 2 autogluon.eda.auto.target_analysis(train_data=df, label="Strength") AttributeError: module 'autogluon' has no attribute 'eda'
In [16]:
import autogluon.eda as auto
auto.target_analysis(train_data=df_train, label=target_col)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[16], line 1 ----> 1 import autogluon.eda as auto 3 auto.target_analysis(train_data=df_train, label=target_col) ModuleNotFoundError: No module named 'autogluon.eda'
Correlazione¶
In [14]:
sns.heatmap(df.corr(), annot=True)
plt.show()
Feature Engineering¶
Train test split and prediction¶
In [24]:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
# Load your data
data = TabularDataset(df)
# Split the data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Save splits to separate files (optional)
train_data.to_csv('train.csv')
test_data.to_csv('test.csv')
# Define label column
label = 'Strength'
# Train the model
predictor = TabularPredictor(label=label).fit(train_data)
# Make predictions on test data
predictions = predictor.predict(test_data)
# Evaluate performance
performance = predictor.evaluate(test_data)
print(f'Model performance on test data: {performance}')
No path specified. Models will be saved in: "AutogluonModels\ag-20251109_124701"
Verbosity: 2 (Standard Logging)
=================== System Info ===================
AutoGluon Version: 1.4.0
Python Version: 3.12.10
Operating System: Windows
Platform Machine: AMD64
Platform Version: 10.0.26100
CPU Count: 22
Memory Avail: 11.91 GB / 31.47 GB (37.9%)
Disk Space Avail: 560.29 GB / 952.63 GB (58.8%)
===================================================
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
presets='best' : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
presets='high' : Strong accuracy with fast inference speed.
presets='good' : Good accuracy with very fast inference speed.
presets='medium' : Fast training time, ideal for initial prototyping.
Using hyperparameters preset: hyperparameters='default'
Beginning AutoGluon training ...
AutoGluon will save models to "c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\AutogluonModels\ag-20251109_124701"
Train Data Rows: 4325
Train Data Columns: 8
Label Column: Strength
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
Label info (max, min, mean, stddev): (82.6, 2.33, 35.61067, 16.37442)
If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type: regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 12133.86 MB
Train Data (Original) Memory Usage: 0.26 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 8 | ['CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 8 | ['CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', ...]
0.0s = Fit runtime
8 features in original data used to generate 8 features in processed data.
Train Data (Processed) Memory Usage: 0.26 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.03s ...
AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'
This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.11560693641618497, Train Rows: 3825, Val Rows: 500
User-specified model hyperparameters to be fit:
{
'NN_TORCH': [{}],
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0, 'hyperparameter_tune_kwargs': None}}],
'CAT': [{}],
'XGB': [{}],
'FASTAI': [{}],
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
}
Fitting 9 L1 models, fit_strategy="sequential" ...
Fitting model: LightGBMXT ...
Fitting with cpus=16, gpus=0, mem=0.0/11.9 GB
-12.8225 = Validation score (-root_mean_squared_error)
10.11s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
Fitting with cpus=16, gpus=0, mem=0.0/11.8 GB
-12.72 = Validation score (-root_mean_squared_error)
1.15s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestMSE ...
Fitting with cpus=22, gpus=0
-13.5689 = Validation score (-root_mean_squared_error)
1.36s = Training runtime
0.07s = Validation runtime
Fitting model: CatBoost ...
Fitting with cpus=16, gpus=0
-12.6782 = Validation score (-root_mean_squared_error)
5.17s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesMSE ...
Fitting with cpus=22, gpus=0
-13.6565 = Validation score (-root_mean_squared_error)
0.56s = Training runtime
0.06s = Validation runtime
Fitting model: NeuralNetFastAI ...
Fitting with cpus=16, gpus=0, mem=0.0/11.3 GB
-12.7561 = Validation score (-root_mean_squared_error)
4.87s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
Fitting with cpus=16, gpus=0
-12.8497 = Validation score (-root_mean_squared_error)
0.86s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
Fitting with cpus=16, gpus=0, mem=0.0/11.6 GB
c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Lib\site-packages\sklearn\compose\_column_transformer.py:975: FutureWarning: The parameter `force_int_remainder_cols` is deprecated and will be removed in 1.9. It has no effect. Leave it to its default value to avoid this warning.
warnings.warn(
-12.7746 = Validation score (-root_mean_squared_error)
10.16s = Training runtime
0.02s = Validation runtime
Fitting model: LightGBMLarge ...
Fitting with cpus=16, gpus=0, mem=0.0/11.2 GB
-13.1556 = Validation score (-root_mean_squared_error)
2.59s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
Ensemble Weights: {'NeuralNetFastAI': 0.333, 'NeuralNetTorch': 0.292, 'CatBoost': 0.167, 'RandomForestMSE': 0.125, 'LightGBM': 0.083}
-12.5293 = Validation score (-root_mean_squared_error)
0.03s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 37.72s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 4322.3 rows/s (500 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\AutogluonModels\ag-20251109_124701")
Model performance on test data: {'root_mean_squared_error': np.float64(-12.223556126124398), 'mean_squared_error': -149.4153243685133, 'mean_absolute_error': -9.472742944107477, 'r2': 0.45091903272671363, 'pearsonr': 0.6717917835193142, 'median_absolute_error': -7.4079645538330094}
In [34]:
df['Strength'].describe()
Out[34]:
count 5407.000000 mean 35.452071 std 16.401896 min 2.330000 25% 23.640000 50% 33.950000 75% 45.850000 max 82.600000 Name: Strength, dtype: float64
In [25]:
performance
Out[25]:
{'root_mean_squared_error': np.float64(-12.223556126124398),
'mean_squared_error': -149.4153243685133,
'mean_absolute_error': -9.472742944107477,
'r2': 0.45091903272671363,
'pearsonr': 0.6717917835193142,
'median_absolute_error': -7.4079645538330094}
In [26]:
predictor.leaderboard(test_data, silent=True)
Out[26]:
| model | score_test | score_val | eval_metric | pred_time_test | pred_time_val | fit_time | pred_time_test_marginal | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CatBoost | -12.186064 | -12.678166 | root_mean_squared_error | 0.015701 | 0.000000 | 5.166798 | 0.015701 | 0.000000 | 5.166798 | 1 | True | 4 |
| 1 | WeightedEnsemble_L2 | -12.223556 | -12.529300 | root_mean_squared_error | 0.252403 | 0.115678 | 22.729485 | 0.007087 | 0.000000 | 0.027645 | 2 | True | 10 |
| 2 | LightGBMXT | -12.230122 | -12.822526 | root_mean_squared_error | 0.009604 | 0.009000 | 10.112404 | 0.009604 | 0.009000 | 10.112404 | 1 | True | 1 |
| 3 | LightGBM | -12.352122 | -12.720001 | root_mean_squared_error | 0.002999 | 0.009243 | 1.150662 | 0.002999 | 0.009243 | 1.150662 | 1 | True | 2 |
| 4 | NeuralNetTorch | -12.385383 | -12.774633 | root_mean_squared_error | 0.016525 | 0.016654 | 10.162089 | 0.016525 | 0.016654 | 10.162089 | 1 | True | 8 |
| 5 | XGBoost | -12.453361 | -12.849735 | root_mean_squared_error | 0.023535 | 0.009510 | 0.863003 | 0.023535 | 0.009510 | 0.863003 | 1 | True | 7 |
| 6 | NeuralNetFastAI | -12.608055 | -12.756147 | root_mean_squared_error | 0.029259 | 0.019957 | 4.866769 | 0.029259 | 0.019957 | 4.866769 | 1 | True | 6 |
| 7 | LightGBMLarge | -12.696348 | -13.155580 | root_mean_squared_error | 0.008000 | 0.009005 | 2.585604 | 0.008000 | 0.009005 | 2.585604 | 1 | True | 9 |
| 8 | ExtraTreesMSE | -13.092905 | -13.656455 | root_mean_squared_error | 0.186489 | 0.058352 | 0.555173 | 0.186489 | 0.058352 | 0.555173 | 1 | True | 5 |
| 9 | RandomForestMSE | -13.124074 | -13.568943 | root_mean_squared_error | 0.180832 | 0.069824 | 1.355521 | 0.180832 | 0.069824 | 1.355521 | 1 | True | 3 |
In [ ]:
predictor
In [ ]:
In [ ]:
Shapely plots¶
In [29]:
# install shap in the notebook environment
%pip install -q shap --upgrade# install shap in the notebook environment
Note: you may need to restart the kernel to use updated packages.
Usage: c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] <requirement specifier> [package-index-options] ... c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] -r <requirements file> [package-index-options] ... c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] [-e] <vcs project url> ... c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] [-e] <local project path> ... c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Scripts\python.exe -m pip install [options] <archive url/path> ... no such option: --upgrade#
In [ ]:
from autogluon.eda import analysis
import os
from IPython.display import IFrame, display
# Run AutoGluon EDA to produce an interactive report for the training data
outdir = "ag_eda_report"
ana = analysis.Analysis(train_data, label=label, problem_type=None, output_dir=outdir)
ana.run()
index_path = os.path.join(outdir, "index.html")
if os.path.exists(index_path):
display(IFrame(index_path, width=1000, height=600))
else:
print(f"EDA report created in: {os.path.abspath(outdir)} (no index.html found)")
In [32]:
%pip install shap
Collecting shap Using cached shap-0.49.1-cp312-cp312-win_amd64.whl.metadata (25 kB) Requirement already satisfied: numpy in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (2.1.3) Requirement already satisfied: scipy in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (1.16.3) Requirement already satisfied: scikit-learn in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (1.7.2) Requirement already satisfied: pandas in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (2.3.3) Requirement already satisfied: tqdm>=4.27.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (4.67.1) Requirement already satisfied: packaging>20.9 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (25.0) Requirement already satisfied: slicer==0.0.8 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (0.0.8) Requirement already satisfied: numba>=0.54 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (0.62.1) Requirement already satisfied: cloudpickle in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (3.1.2) Requirement already satisfied: typing-extensions in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from shap) (4.15.0) Requirement already satisfied: llvmlite<0.46,>=0.45.0dev0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from numba>=0.54->shap) (0.45.1) Requirement already satisfied: colorama in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from pandas->shap) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from pandas->shap) (2025.2) Requirement already satisfied: tzdata>=2022.7 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from pandas->shap) (2025.2) Requirement already satisfied: joblib>=1.2.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from scikit-learn->shap) (1.5.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from scikit-learn->shap) (3.6.0) Requirement already satisfied: six>=1.5 in c:\users\pietroboschini\onedrive - digital360\desktop\github repos\concrete\concrete\venv\lib\site-packages (from python-dateutil>=2.8.2->pandas->shap) (1.17.0) Using cached shap-0.49.1-cp312-cp312-win_amd64.whl (548 kB) Installing collected packages: shap Successfully installed shap-0.49.1 Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 25.0.1 -> 25.3 [notice] To update, run: python.exe -m pip install --upgrade pip
In [39]:
import shap
explainer = shap.Explainer("Catboost", train_data.drop(columns=[label]))
shap_values = explainer(test_data.drop(columns=[label]))
shap.summary_plot(shap_values)
shap.summary_plot(shap_values)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[39], line 3 1 import shap ----> 3 explainer = shap.Explainer("Catboost", train_data.drop(columns=[label])) 4 shap_values = explainer(test_data.drop(columns=[label])) 5 shap.summary_plot(shap_values) File c:\Users\PietroBoschini\OneDrive - Digital360\Desktop\Github Repos\concrete\concrete\venv\Lib\site-packages\shap\explainers\_explainer.py:204, in Explainer.__init__(self, model, masker, link, algorithm, output_names, feature_names, linearize_link, seed, **kwargs) 200 algorithm = "permutation" 202 # if we get here then we don't know how to handle what was given to us 203 else: --> 204 raise TypeError( 205 "The passed model is not callable and cannot be analyzed directly with the given masker! Model: " 206 + str(model) 207 ) 209 # build the right subclass 210 if algorithm == "exact": TypeError: The passed model is not callable and cannot be analyzed directly with the given masker! Model: Catboost
In [38]:
predictor.model_best
Out[38]:
'WeightedEnsemble_L2'